#### Supplementary Material for "Holding Ground...."
## Difference in Discontinuity Analysis
## Authors: Kathryn Baragwanath, Ella Bayi, and Guilherme N. Fasolin
##### Date: November 17, 2025

## Loading Packages
library(tidyverse)
library(data.table)
library(lubridate)
library(rdrobust)
library(fixest)
library(vtable)   
library(ggplot2) 
library(knitr)
library(sandwich)

rm(list = ls())
gc()

#Set working directory here
wd <- ""

### Load PAs data -------------------
df_pas <- readRDS(file.path(wd, "df_pas.rds"))
df_pas <- subset(df_pas, year >= 2009 & year <= 2022)

# ----------------------------------------------------------------------------- #
### Estimating Differences-in-Discontinuity – ITs - Full Sample ---- Tables S12 & S13
# ----------------------------------------------------------------------------- #
# Adjust distance variable: negative outside IT, positive inside
df_pas$near_it <- ifelse(df_pas$grid_is_it == 0, df_pas$near_it*(-1), df_pas$near_it)

# Group the dataframe by spatial ID (near_it_FID) and year
p_it <- df_pas %>%    
        group_by(near_it_FID, year) 

# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the IT was homologated
# 3. Keep observations within ±12km of the IT boundary (running variable window)
p_it <- subset(p_it, !is.na(near_it_hom_year) & year >= near_it_hom_year & 
                 near_it >= -12000 & near_it <= 12000)

# Create spatial treatment indicator: 
# 1 if inside the IT (positive near_it) and within the 0–12km window
p_it$rdd <- ifelse(p_it$near_it > 0 & p_it$near_it <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_it$post_2012 <- ifelse(p_it$year > 2011 & p_it$year < 2023, 1, 0)
p_it$post_2018 <- ifelse(p_it$year > 2018 & p_it$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than ITs (e.g., SPs or SUs)
p_it$deforestation_pct <- ifelse(p_it$grid_is_sp & p_it$grid_is_su == 1, NA, p_it$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_it$deforestation_pct, x = p_it$near_it, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_it  <- p_it  %>%
  filter(abs(near_it) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_it) / h))

# Differences-in-Discontinuities regression for post-2012
r_it_2012 <- feols(deforestation_pct ~ rdd * post_2012 | near_it_FID + year, data = p_it,
              cluster = ~near_it_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel A - Column 1
etable(r_it_2012)

# Differences-in-Discontinuities regression for post-2018
r_it_2018 <- feols(deforestation_pct ~ rdd * post_2018 | near_it_FID + year, data = p_it,
                 cluster = ~near_it_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel A - Column 1
etable(r_it_2018)

#------------------------------------------------------------------------------#
### Estimating Differences-in-Discontinuity – ITs - Arc of Deforestation ---- Tables S12 & S13
#-------------------------------------------------------------------------------
# Group the dataframe by spatial ID (near_it_FID) and year
p_it_arc <- df_pas %>%
  filter(sigla_uf  %in% c("PA", "MT", "RO", "AC")) %>%
  group_by(near_it_FID, year) %>%  
  ungroup()
  
# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the IT was homologated
# 3. Keep observations within ±12km of the IT boundary (running variable window)
p_it_arc <- subset(p_it_arc, !is.na(near_it_hom_year) & year >= near_it_hom_year & 
                 near_it >= -12000 & near_it <= 12000)

# Create spatial treatment indicator:
# 1 if inside the IT (positive near_it) and within the 0–12km window
p_it_arc$rdd <- ifelse(p_it_arc$near_it > 0 & p_it_arc $near_it <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_it_arc$post_2012 <- ifelse(p_it_arc$year > 2011 & p_it_arc$year < 2023, 1, 0)
p_it_arc$post_2018 <- ifelse(p_it_arc$year > 2018 & p_it_arc$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than ITs (e.g., SPs or SUs)
p_it_arc$deforestation_pct <- ifelse(p_it_arc$grid_is_sp & p_it_arc$grid_is_su == 1, NA, p_it_arc$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_it_arc$deforestation_pct, x = p_it_arc$near_it, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_it_arc  <- p_it_arc  %>%
  filter(abs(near_it) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_it) / h))

# Differences-in-Discontinuities regression for post-2012
r_it_2012_arc <- feols(deforestation_pct ~ rdd * post_2012 | near_it_FID + year, data = p_it_arc,
                   cluster = ~near_it_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel A - Column 2
etable(r_it_2012_arc)

# Differences-in-Discontinuities regression for post-2018
r_it_2018_arc <- feols(deforestation_pct ~ rdd * post_2018 | near_it_FID + year, data = p_it_arc,
                   cluster = ~near_it_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel A - Column 2
etable(r_it_2018_arc)

#------------------------------------------------------------------------------#
### Estimating Differences-in-Discontinuity – ITs - Non-Arc of Deforestation ---- Tables S12 & S13
#-------------------------------------------------------------------------------
# Group the dataframe by spatial ID (near_it_FID) and year
p_it_nonarc <- df_pas %>%
  filter(!sigla_uf  %in% c("PA", "MT", "RO", "AC")) %>%
  group_by(near_it_FID, year) %>%  
  ungroup()

# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the IT was homologated
# 3. Keep observations within ±12km of the IT boundary (running variable window)
p_it_nonarc <- subset(p_it_nonarc, !is.na(near_it_hom_year) & year >= near_it_hom_year & 
                     near_it >= -12000 & near_it <= 12000)

# Create a spatial treatment indicator:
# 1 if inside the IT (positive near_it) and within the 0–12km window
p_it_nonarc$rdd <- ifelse(p_it_nonarc$near_it > 0 & p_it_nonarc$near_it <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_it_nonarc$post_2012 <- ifelse(p_it_nonarc$year > 2011 & p_it_nonarc$year < 2023, 1, 0)
p_it_nonarc$post_2018 <- ifelse(p_it_nonarc$year > 2018 & p_it_nonarc$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than ITs (e.g., SPs or SUs)
p_it_nonarc$deforestation_pct <- ifelse(p_it_nonarc$grid_is_sp & p_it_nonarc$grid_is_su == 1, NA, 
                                        p_it_nonarc$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_it_nonarc$deforestation_pct, x = p_it_nonarc$near_it, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_it_nonarc  <- p_it_nonarc  %>%
  filter(abs(near_it) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_it) / h))

# Differences-in-Discontinuities regression for post-2012
r_it_2012_nonarc <- feols(deforestation_pct ~ rdd * post_2012 | near_it_FID + year, data = p_it_nonarc,
                       cluster = ~near_it_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel A - Column 3
etable(r_it_2012_nonarc)

# Differences-in-Discontinuities regression for post-2018
r_it_2018_nonarc <- feols(deforestation_pct ~ rdd * post_2018 | near_it_FID + year, data = p_it_nonarc,
                       cluster = ~near_it_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel A - Column 3
etable(r_it_2018_nonarc)

#------------------------------------------------------------------------------#
### Tabular Formal – ITs ---- Tables S12 & S13
#-------------------------------------------------------------------------------
## Tabular Format - 2012 (Table S11, Panel A)
etable(
  r_it_2012, 
  r_it_2012_arc, 
  r_it_2012_nonarc,
  dict = c(
    "rdd" = "Inside",
    "rdd:post_2018" = "Inside × Post-2018",
    "deforestation_pct" = "Deforestation (%)"
  ),
  depvar = TRUE,
  headers = c("Full Sample", "Arc of Def", "Non-Arc of Def"),
  title = "Table S9 Panel A — Dependent Variable: Deforestation (%)"
)

## Tabular Format - 2012 (Table S12, Panel A)
etable(
  r_it_2018, 
  r_it_2018_arc, 
  r_it_2018_nonarc,
  dict = c(
    "rdd" = "Inside",
    "rdd:post_2018" = "Inside × Post-2018",
    "deforestation_pct" = "Deforestation (%)"
  ),
  depvar = TRUE,
  headers = c("Full Sample", "Arc of Def", "Non-Arc of Def"),
  title = "Table S9 Panel A — Dependent Variable: Deforestation (%)"
)


#------------------------------------------------------------------------------#
### # - Linear Hypothesis Tests  – ITs -- Table S14 - Panel A ----
#-------------------------------------------------------------------------------
# Function to do linear hypothesis test + compute total effect + CI + interpretation
test_and_total_effect <- function(model, base, interaction, cluster_var) {
    # Extract cluster IDs
    cluster_ids <- model$model[[cluster_var]]
    
    # Get clustered vcov matrix
    vcov_clust <- vcovCL(model, cluster = cluster_ids)
    
    # Coefs and names
    coefs <- coef(model)
    coef_names <- names(coefs)
    
    # Make sure terms exist
    if (!(base %in% coef_names)) stop(paste("Base term", base, "not in coefficients"))
    if (!(interaction %in% coef_names)) stop(paste("Interaction term", interaction, "not in coefficients"))
    
    # Construct hypothesis matrix L for Wald test: sum of base + interaction = 0
    L <- rep(0, length(coefs))
    names(L) <- coef_names
    L[base] <- 1
    L[interaction] <- 1
    
    # Compute estimate and SE
    est <- sum(L * coefs)
    var <- as.numeric(t(L) %*% vcov_clust %*% L)
    se <- sqrt(var)
    
    # Wald test statistic (chi-square with df=1)
    wald_stat <- (est^2) / var
    pval <- 1 - pchisq(wald_stat, df = 1)
    
    # 95% CI
    ci_low <- est - 1.96 * se
    ci_high <- est + 1.96 * se
    
    # Interpretation
    interpret <- ifelse(pval < 0.05, "Reject null — significant", "Fail to reject null")
    
    # Return a data.frame row
    data.frame(
      Estimate = round(est, 4),
      Std_Error = round(se, 4),
      CI_Lower = round(ci_low, 4),
      CI_Upper = round(ci_high, 4),
      Wald_ChiSq = round(wald_stat, 3),
      P_value = round(pval, 4),
      Interpretation = interpret,
      row.names = NULL
    )
  }

# Storing all models in a named list
models <- list(
  r_it_2012 = r_it_2012,
  r_it_2018 = r_it_2018,
  r_it_2012_arc = r_it_2012_arc,
  r_it_2018_arc = r_it_2018_arc,
  r_it_2012_nonarc = r_it_2012_nonarc,
  r_it_2018_nonarc = r_it_2018_nonarc
)

# Storing corresponding hypotheses for each model
hypotheses <- list(
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018"),
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018"),
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018")
)

# This is the variable used for cluster-robust standard errors
cluster_var <- "near_it_FID"

# Initializing a list to store results from each mode
results_list <- list()

# Loop through all models and hypotheses
for(i in seq_along(models)) {
  mod_name <- names(models)[i]
  model <- models[[i]]
  base <- hypotheses[[i]][1]
  interaction <- hypotheses[[i]][2]
  
  res <- test_and_total_effect(model, base, interaction, cluster_var)
  res$Model <- mod_name
  res$Hypothesis <- paste0(base, " + ", interaction, " = 0")
  
  results_list[[i]] <- res[, c("Model", "Hypothesis", names(res)[1:7])]
}

# Combining all results into a single data frame
results_ht_it <- do.call(rbind, results_list)

##  Rename Model and Hypothesis labels (Table S9 - Panel A)
model_names_map <- c(
  r_it_2012 = "Post 2012",
  r_it_2018 = "Post 2018",
  r_it_2012_arc = "Arc - Post-2012",
  r_it_2018_arc = "Arc - Post-2018",
  r_it_2012_nonarc = "Non-Arc - Post-2012",
  r_it_2018_nonarc = "Non-Arc - Post-2018"
)


hypothesis_names_map <- c(
  "rdd + rdd:post_2012 = 0" = "B1 + B3 = 0",
  "rdd + rdd:post_2018 = 0" = "B1 + B3 = 0"
)

# Replace Model names in the results table
results_ht_it$Model <- model_names_map[results_ht_it$Model]
results_ht_it$Hypothesis <- hypothesis_names_map[results_ht_it$Hypothesis]

# View the renamed results - Table S9 - IT - Panel A (Model)
print(results_ht_it)

#------------------------------------------------------------------------------#
### # - ### Robustness Check - Pre-Trends ----  – ITs -- Figure S18 ----
#-------------------------------------------------------------------------------
# Computing mean deforestation by year and treatment status
# 'it_hom_inside' indicates whether the area is inside or outside the SP boundary
mean_by_year <- p_it %>%
  group_by(year, rdd) %>%
  summarise(mean_estimate = mean(deforestation_pct, na.rm = TRUE), .groups = "drop")

# Creating a descriptive label for each group (inside vs. outside IT)
mean_by_year$group <- ifelse(mean_by_year$rdd == 1, "Inside IT", "Outside IT")

# Plotting mean deforestation trends over time for both groups
fig_trends_it <- ggplot(mean_by_year, aes(x = year, y = mean_estimate, color = group)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  # Vertical lines for key years
  geom_vline(xintercept = 2012, linetype = "dashed", color = "red") +
  geom_vline(xintercept = 2018, linetype = "dashed", color = "red") +
 annotate("text", x = 2012, y = max(mean_by_year$mean_estimate) * 0.8,
          label = "Forest Code", angle = 90, vjust = -0.5, size = 4.5, color = "red") +
  annotate("text", x = 2019, y = max(mean_by_year$mean_estimate) * 0.73,
          label = "Bolsonaro Elected", angle = 90, vjust = -0.5, size = 4.5, color = "red") +
  scale_x_continuous(
    breaks = seq(min(mean_by_year$year), max(mean_by_year$year), by = 3),
    labels = as.character(seq(min(mean_by_year$year), max(mean_by_year$year), by = 3))
  ) +
  ylab(NULL) +
  xlab(NULL) +
  labs(
    title = "Deforestation Trends",
   x = "Year",
   y = "Mean Deforestation (%)",
    color = "Location"
  ) +
  scale_color_manual(values = c("Inside IT" = "black", "Outside IT" = "darkgray")) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 13, margin = margin(b = 10)),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 12),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_line(color = "grey90")
  )

fig_trends_it

# ----------------------------------------------------------------------------- #
### Estimating Differences-in-Discontinuity – SPs - Full Sample ---- Tables S12 & S13
# ---------------------------------------------------------------- --------


# Create 'near_pi' containing 'near_pa' values only for SP areas; others set to NA
df_pas$near_sp <- ifelse(df_pas$near_pa_type == "Proteção Integral", df_pas$near_pa, 
                         as.numeric(NA))

## Keep only the federal-level protected areas 
df_pas <- df_pas %>% filter(near_pa_gov == "Federal")

# Adjust distance variable: negative outside IT, positive inside
df_pas$near_sp <- ifelse(df_pas$grid_is_sp == 0, df_pas$near_sp*(-1), df_pas$near_sp)

# Group the dataframe by spatial ID (near_pa_FID) and year
p_sp <- df_pas %>%
  group_by(near_pa_FID, year) 

# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the SP was homologated
# 3. Keep observations within ±12km of the SP boundary (running variable window)
p_sp <- subset(p_sp, !is.na(near_pa_ano) & year >= near_pa_ano &
                 near_sp >= -12000 & near_sp <= 12000)

# Create a spatial treatment indicator:
# 1 if inside the SP (positive near_sp) and within the 0–12km window
p_sp$rdd <- ifelse(p_sp$near_sp > 0 & p_sp$near_sp <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_sp$post_2012 <- ifelse(p_sp$year > 2011 & p_sp$year < 2023, 1, 0)
p_sp$post_2018 <- ifelse(p_sp$year > 2018 & p_sp$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than SPs (e.g., ITs or SUs)
p_sp$deforestation_pct <- ifelse(p_sp$grid_is_it & p_sp$grid_is_su == 1, NA, p_sp$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_sp$deforestation_pct, x = p_sp$near_sp, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_sp <- p_sp %>%
  filter(abs(near_sp) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_sp) / h))

# Differences-in-Discontinuities regression for post-2012
r_sp_2012 <- feols(deforestation_pct ~ rdd * post_2012 | year + near_pa_FID, data = p_sp,
              cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel B - Column 1
etable(r_sp_2012)

# Differences-in-Discontinuities regression for post-2018
r_sp_2018 <- feols(deforestation_pct ~ rdd * post_2018 | near_pa_FID + year, data = p_sp,
                   cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel B - Column 1
etable(r_sp_2018)

# ----------------------------------------------------------------------------- #
### ### Estimating Differences-in-Discontinuity – SPs - Arc of Deforestation ---- Tables S12 & S13
#--------------------------------------------------------------- --------


# Group the dataframe by spatial ID (near_pa_FID) and year
p_sp_arc <- df_pas %>%
  filter(sigla_uf  %in% c("PA", "MT", "RO", "AC")) %>%
  group_by(near_pa_FID, year) %>%
  ungroup()

# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the SP was homologated
# 3. Keep observations within ±12km of the SP boundary (running variable window)
p_sp_arc <- subset(p_sp_arc, !is.na(near_pa_ano) & year >= near_pa_ano &
                 near_sp >= -12000 & near_sp <= 12000)

# Create a spatial treatment indicator:
# 1 if inside the SP (positive near_sp) and within the 0–12km window
p_sp_arc$rdd <- ifelse(p_sp_arc$near_sp > 0 & p_sp_arc$near_sp <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_sp_arc$post_2012 <- ifelse(p_sp_arc$year > 2011 & p_sp_arc$year < 2023, 1, 0)
p_sp_arc$post_2018 <- ifelse(p_sp_arc$year > 2018 & p_sp_arc$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than SPs (e.g., ITs or SUs)
p_sp_arc$deforestation_pct <- ifelse(p_sp_arc$grid_is_it & p_sp_arc$grid_is_su == 1, NA, 
                                     p_sp_arc$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_sp_arc$deforestation_pct, x = p_sp_arc$near_sp, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_sp_arc <- p_sp_arc %>%
  filter(abs(near_sp) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_sp) / h))

# Differences-in-Discontinuities regression for post-2012
r_sp_2012_arc <- feols(deforestation_pct ~ rdd * post_2012 | year + near_pa_FID, data = p_sp_arc,
                   cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel B - Column 2
etable(r_sp_2012_arc)

# Differences-in-Discontinuities regression for post-2018
r_sp_2018_arc <- feols(deforestation_pct ~ rdd * post_2018 | near_pa_FID + year, data = p_sp_arc,
                   cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel B - Column 2
etable(r_sp_2018_arc)

# ----------------------------------------------------------------------------- #
### ### Estimating Differences-in-Discontinuity – SPs - Non-Arc of Deforestation ---- Tables S12 & S13
# # -------------------------------------------------------------- --------


# Group the dataframe by spatial ID (near_pa_FID) and year
p_sp_nonarc <- df_pas %>%
  filter(!sigla_uf  %in% c("PA", "MT", "RO", "AC")) %>%
  group_by(near_pa_FID, year) %>%
  ungroup()

# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the SP was homologated
# 3. Keep observations within ±12km of the SP boundary (running variable window)
p_sp_nonarc <- subset(p_sp_nonarc, !is.na(near_pa_ano) & year >= near_pa_ano &
                     near_sp >= -12000 & near_sp <= 12000)

# Create a spatial treatment indicator:
# 1 if inside the SP (positive near_sp) and within the 0–12km window
p_sp_nonarc$rdd <- ifelse(p_sp_nonarc$near_sp > 0 & p_sp_nonarc$near_sp <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_sp_nonarc$post_2012 <- ifelse(p_sp_nonarc$year > 2011 & p_sp_nonarc$year < 2023, 1, 0)
p_sp_nonarc$post_2018 <- ifelse(p_sp_nonarc$year > 2018 & p_sp_nonarc$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than SPs (e.g., ITs or SUs)
p_sp_nonarc$deforestation_pct <- ifelse(p_sp_nonarc$grid_is_it & p_sp_nonarc$grid_is_su == 1, NA, 
                                        p_sp_nonarc$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_sp_nonarc$deforestation_pct, x = p_sp_nonarc$near_sp, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_sp_nonarc <- p_sp_nonarc %>%
  filter(abs(near_sp) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_sp) / h))

# Differences-in-Discontinuities regression for post-2012
r_sp_2012_nonarc <- feols(deforestation_pct ~ rdd * post_2012 | year + near_pa_FID, data = p_sp_nonarc,
                       cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel B - Column 3
etable(r_sp_2012_nonarc)

# Differences-in-Discontinuities regression for post-2018
r_sp_2018_nonarc <- feols(deforestation_pct ~ rdd * post_2018 | near_pa_FID + year, data = p_sp_nonarc,
                          cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel B - Column 3
etable(r_sp_2018_nonarc)

#------------------------------------------------------------------------------#
### Tabular Formal – SPs ---- Table S11 and S12
-------------------------------------------------------------------------------
## Tabular Format - 2012 (Table S8, Panel B)
etable(
    r_sp_2012, 
    r_sp_2012_arc, 
    r_sp_2012_nonarc,
    dict = c(
      "rdd" = "Inside",
      "rdd:post_2018" = "Inside × Post-2018",
      "deforestation_pct" = "Deforestation (%)"
    ),
    depvar = TRUE,
    headers = c("Full Sample", "Arc of Def", "Non-Arc of Def"),
    title = "Table S9 Panel B — Dependent Variable: Deforestation (%)",
    notes = "\\textbf{Significance levels}: ***: $p<0.01$, **: $p<0.05$, *: $p<0.1$."
  )

## Tabular Format - 2012 (Table S8, Panel B)
etable(
  r_sp_2018, 
  r_sp_2018_nonarc, 
  r_sp_2018_nonarc,
  dict = c(
    "rdd" = "Inside",
    "rdd:post_2018" = "Inside × Post-2018",
    "deforestation_pct" = "Deforestation (%)"
  ),
  depvar = TRUE,
  headers = c("Full Sample", "Arc of Def", "Non-Arc of Def"),
  title = "Table S9 Panel B — Dependent Variable: Deforestation (%)",
  notes = "\\textbf{Significance levels}: ***: $p<0.01$, **: $p<0.05$, *: $p<0.1$."
)


#------------------------------------------------------------------------------#
### # - Linear Hypothesis Tests  – SPs -- Table S14 - Panel B ----
#-------------------------------------------------------------------------------
# Function to do linear hypothesis test + compute total effect + CI + interpretation
test_and_total_effect <- function(model, base, interaction, cluster_var) {
    # Extract cluster IDs
    cluster_ids <- model$model[[cluster_var]]
    
    # Get clustered vcov matrix
    vcov_clust <- vcovCL(model, cluster = cluster_ids)
    
    # Coefs and names
    coefs <- coef(model)
    coef_names <- names(coefs)
    
    # Make sure terms exist
    if (!(base %in% coef_names)) stop(paste("Base term", base, "not in coefficients"))
    if (!(interaction %in% coef_names)) stop(paste("Interaction term", interaction, "not in coefficients"))
    
    # Construct hypothesis matrix L for Wald test: sum of base + interaction = 0
    L <- rep(0, length(coefs))
    names(L) <- coef_names
    L[base] <- 1
    L[interaction] <- 1
    
    # Compute estimate and SE
    est <- sum(L * coefs)
    var <- as.numeric(t(L) %*% vcov_clust %*% L)
    se <- sqrt(var)
    
    # Wald test statistic (chi-square with df=1)
    wald_stat <- (est^2) / var
    pval <- 1 - pchisq(wald_stat, df = 1)
    
    # 95% CI
    ci_low <- est - 1.96 * se
    ci_high <- est + 1.96 * se
    
    # Interpretation
    interpret <- ifelse(pval < 0.05, "Reject null — significant", "Fail to reject null")
    
    # Return a data.frame row
    data.frame(
      Estimate = round(est, 4),
      Std_Error = round(se, 4),
      CI_Lower = round(ci_low, 4),
      CI_Upper = round(ci_high, 4),
      Wald_ChiSq = round(wald_stat, 3),
      P_value = round(pval, 4),
      Interpretation = interpret,
      row.names = NULL
    )
  }

# Storing all models in a named list
models <- list(
  r_sp_2012 = r_sp_2012,
  r_sp_2018 = r_sp_2018,
  r_sp_2012_arc = r_sp_2012_arc,
  r_sp_2018_arc = r_sp_2018_arc,
  r_sp_2012_nonarc = r_sp_2012_nonarc,
  r_sp_2018_nonarc = r_sp_2018_nonarc
)

# Storing corresponding hypotheses for each model
hypotheses <- list(
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018"),
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018"),
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018")
)

# This is the variable used for cluster-robust standard errors
cluster_var <- "near_pa_FID"

# Initializing a list to store results from each mode
results_list <- list()

# Loop through all models and hypotheses
for(i in seq_along(models)) {
  mod_name <- names(models)[i]
  model <- models[[i]]
  base <- hypotheses[[i]][1]
  interaction <- hypotheses[[i]][2]
  
  res <- test_and_total_effect(model, base, interaction, cluster_var)
  res$Model <- mod_name
  res$Hypothesis <- paste0(base, " + ", interaction, " = 0")
  
  results_list[[i]] <- res[, c("Model", "Hypothesis", names(res)[1:7])]
}

# Combining all results into a single data frame
results_ht_sp <- do.call(rbind, results_list)

##  Rename Model and Hypothesis labels (Table S9 - Panel B)
model_names_map <- c(
  r_sp_2012 = "Post 2012",
  r_sp_2018 = "Post 2018",
  r_sp_2012_arc = "Arc - Post-2012",
  r_sp_2018_arc = "Arc - Post-2018",
  r_sp_2012_nonarc = "Non-Arc - Post-2012",
  r_sp_2018_nonarc = "Non-Arc - Post-2018"
)

hypothesis_names_map <- c(
  "rdd + rdd:post_2012 = 0" = "B1 + B3 = 0",
  "rdd + rdd:post_2018 = 0" = "B1 + B3 = 0"
)

# Replace Model names in the results table
results_ht_sp$Model <- model_names_map[results_ht_sp$Model]
results_ht_sp$Hypothesis <- hypothesis_names_map[results_ht_sp$Hypothesis]

# View the renamed results - Table S9 - SP - Panel B
print(results_ht_sp)


#------------------------------------------------------------------------------#
### - ### Robustness Check - Pre-Trends ----  SPs -- Figure S18 ----
#-------------------------------------------------------------------------------
# Computing mean deforestation by year and treatment status
# 'sp_hom_inside' indicates whether the area is inside or outside the SP boundary

mean_by_year <- p_sp %>%
              group_by(year, rdd) %>%
              summarise(mean_estimate = mean(deforestation_pct, na.rm = TRUE),  # mean deforestation rate
                        .groups = "drop")

# Creating a descriptive label for each group (inside vs. outside SP)
mean_by_year$group <- ifelse(mean_by_year$rdd == 1, "Inside SP", "Outside SP")

# Plotting mean deforestation trends over time for both groups
fig_trends_sp <- ggplot(mean_by_year, aes(x = year, y = mean_estimate, color = group)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  # Vertical lines for key years
  geom_vline(xintercept = 2012, linetype = "dashed", color = "red") +
  geom_vline(xintercept = 2018, linetype = "dashed", color = "red") +
  annotate("text", x = 2011.5, y = max(mean_by_year$mean_estimate) * 0.7,
           label = "Forest Code", angle = 90, vjust = 1.5, size = 4, color = "red") +
  annotate("text", x = 2017.5, y = max(mean_by_year$mean_estimate) * 0.6,
           label = "Bolsonaro Elected", angle = 90, vjust = 1.5, size = 4, color = "red") +
  scale_x_continuous(
    breaks = seq(min(mean_by_year$year), max(mean_by_year$year), by = 3),
    labels = as.character(seq(min(mean_by_year$year), max(mean_by_year$year), by = 3))
  ) +
  ylab(NULL) +
  xlab(NULL) +
  labs(
    title = "Deforestation Trends",
    x = "Year",
    y = "Mean Deforestation (%)",
    color = "Location"
  ) +
  scale_color_manual(values = c("Inside SP" = "black", "Outside SP" = "darkgray")) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 13, margin = margin(b = 10)),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 12),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_line(color = "grey90")
  )

fig_trends_sp

# ----------------------------------------------------------------------------- #
### Estimating Differences-in-Discontinuity – SUs - Full Sample ---- Tables S12 & S13
#--------------------------------------------------------------- --------


# Create 'near_su' containing 'near_pa' values only for SU areas; others set to NA
df_pas$near_su <- ifelse(df_pas$near_pa_type == "Uso Sustentável", df_pas$near_pa, 
                         as.numeric(NA))

# Adjust distance variable: negative outside IT, positive inside
df_pas$near_su <- ifelse(df_pas$grid_is_su == 0, df_pas$near_su*(-1), df_pas$near_su)

# Group the dataframe by spatial ID (near_pa_FID) and year
p_su <- df_pas %>%
  group_by(near_pa_FID, year) 

# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the SP was homologated
# 3. Keep observations within ±12km of the SP boundary (running variable window)
p_su <- subset(p_su, !is.na(near_pa_ano) & year >= near_pa_ano &
                 near_su >= -12000 & near_su <= 12000)

# Create a spatial treatment indicator:
# 1 if inside the SP (positive near_sp) and within the 0–12km window
p_su$rdd <- ifelse(p_su$near_su > 0 & p_su$near_su <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_su$post_2012 <- ifelse(p_su$year > 2011 & p_su$year < 2023, 1, 0)
p_su$post_2018 <- ifelse(p_su$year > 2018 & p_su$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than SUs (e.g., ITs or SPs)
p_su$deforestation_pct <- ifelse(p_su$grid_is_it & p_su$grid_is_sp == 1, NA, p_su$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_su$deforestation_pct, x = p_su$near_su, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_su <- p_su %>%
  filter(abs(near_su) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_su) / h))

# Differences-in-Discontinuities regression for post-2012
r_su_2012 <- feols(deforestation_pct ~ rdd * post_2012 | year + near_pa_FID, data = p_su,
                   cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel C - Column 1
etable(r_su_2012)

# Differences-in-Discontinuities regression for post-2018
r_su_2018 <- feols(deforestation_pct ~ rdd * post_2018 | near_pa_FID + year, data = p_su,
                   cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel C - Column 1
etable(r_su_2018)

# ----------------------------------------------------------------------------- #
### Estimating Differences-in-Discontinuity – SUs - Arc of Deforestation ---- Tables S12 & S13
# ----------------------------------------------------------------------------- #
# Group the dataframe by spatial ID (near_pa_FID) and year
p_su_arc <- df_pas %>%
  filter(sigla_uf  %in% c("PA", "MT", "RO", "AC")) %>%
  group_by(near_pa_FID, year) %>%
  ungroup()

# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the SP was homologated
# 3. Keep observations within ±12km of the SP boundary (running variable window)
p_su_arc <- subset(p_su_arc, !is.na(near_pa_ano) & year >= near_pa_ano &
                 near_su >= -12000 & near_su <= 12000)

# Create a spatial treatment indicator:
# 1 if inside the SP (positive near_sp) and within the 0–12km window
p_su_arc$rdd <- ifelse(p_su_arc$near_su > 0 & p_su_arc$near_su <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_su_arc$post_2012 <- ifelse(p_su_arc$year > 2011 & p_su_arc$year < 2023, 1, 0)
p_su_arc$post_2018 <- ifelse(p_su_arc$year > 2018 & p_su_arc$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than SUs (e.g., ITs or SPs)
p_su_arc$deforestation_pct <- ifelse(p_su_arc$grid_is_it & p_su_arc$grid_is_sp == 1, NA, p_su_arc$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_su_arc$deforestation_pct, x = p_su_arc$near_su, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_su_arc <- p_su_arc %>%
  filter(abs(near_su) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_su) / h))

# Differences-in-Discontinuities regression for post-2012
r_su_2012_arc <- feols(deforestation_pct ~ rdd * post_2012 | year + near_pa_FID, data = p_su_arc,
                   cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel C - Column 2
etable(r_su_2012_arc)

# Differences-in-Discontinuities regression for post-2018
r_su_2018_arc <- feols(deforestation_pct ~ rdd * post_2018 | near_pa_FID + year, data = p_su_arc,
                   cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel C - Column 2
etable(r_su_2018_arc)

# ----------------------------------------------------------------------------- #
### Estimating Differences-in-Discontinuity – SUs - Non-Arc of Deforestation ---- Tables S12 & S13
# ----------------------------------------------------------------------------- #
# Group the dataframe by spatial ID (near_pa_FID) and year
p_su_nonarc <- df_pas %>%
  filter(!sigla_uf  %in% c("PA", "MT", "RO", "AC")) %>%
  group_by(near_pa_FID, year) %>%
  ungroup()

# Subset observations for the analysis:
# 1. Remove rows with missing intervention year
# 2. Keep only years after the SP was homologated
# 3. Keep observations within ±12km of the SP boundary (running variable window)
p_su_nonarc <- subset(p_su_nonarc, !is.na(near_pa_ano) & year >= near_pa_ano &
                     near_su >= -12000 & near_su <= 12000)

# Create a spatial treatment indicator:
# 1 if inside the SP (positive near_sp) and within the 0–12km window
p_su_nonarc$rdd <- ifelse(p_su_nonarc$near_su > 0 & p_su_nonarc$near_su <= 12000, 1, 0)

# Create post-treatment indicators for different policy periods
p_su_nonarc$post_2012 <- ifelse(p_su_nonarc$year > 2011 & p_su_nonarc$year < 2023, 1, 0)
p_su_nonarc$post_2018 <- ifelse(p_su_nonarc$year > 2018 & p_su_nonarc$year < 2023, 1, 0)

# Remove deforestation observations that are in protected areas other than SUs (e.g., ITs or SPs)
p_su_nonarc$deforestation_pct <- ifelse(p_su_nonarc$grid_is_it & p_su_nonarc$grid_is_sp == 1, NA, 
                                        p_su_nonarc$deforestation_pct)

# Select optimal bandwidth for RDD using the "mserd" method
bandwidth <- rdbwselect(y = p_su_nonarc$deforestation_pct, x = p_su_nonarc$near_su, bwselect = "mserd")
h <- bandwidth$bws[1]

# Filter to observations within the chosen bandwidth and compute triangular weights
p_su_nonarc <- p_su_nonarc %>%
  filter(abs(near_su) <= h) %>%
  mutate(weights = pmax(0, 1 - abs(near_su) / h))

# Differences-in-Discontinuities regression for post-2012
r_su_2012_nonarc <- feols(deforestation_pct ~ rdd * post_2012 | year + near_pa_FID, data = p_su_nonarc,
                       cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S11 - Panel C - Column 3
etable(r_su_2012_nonarc)

# Differences-in-Discontinuities regression for post-2018
r_su_2018_nonarc <- feols(deforestation_pct ~ rdd * post_2018 | near_pa_FID + year, data = p_su_nonarc,
                          cluster = ~near_pa_FID, weights = ~weights)

# Display regression table -- Table S12 - Panel C - Column 3
etable(r_su_2018_nonarc)

#------------------------------------------------------------------------------#
### # - Linear Hypothesis Tests  – SU -- Table S14 - Panel C ---- 
-------------------------------------------------------------------------------
# Function to do linear hypothesis test + compute total effect + CI + interpretation
test_and_total_effect <- function(model, base, interaction, cluster_var) {
    # Extract cluster IDs
    cluster_ids <- model$model[[cluster_var]]
    
    # Get clustered vcov matrix
    vcov_clust <- vcovCL(model, cluster = cluster_ids)
    
    # Coefs and names
    coefs <- coef(model)
    coef_names <- names(coefs)
    
    # Make sure terms exist
    if (!(base %in% coef_names)) stop(paste("Base term", base, "not in coefficients"))
    if (!(interaction %in% coef_names)) stop(paste("Interaction term", interaction, "not in coefficients"))
    
    # Construct hypothesis matrix L for Wald test: sum of base + interaction = 0
    L <- rep(0, length(coefs))
    names(L) <- coef_names
    L[base] <- 1
    L[interaction] <- 1
    
    # Compute estimate and SE
    est <- sum(L * coefs)
    var <- as.numeric(t(L) %*% vcov_clust %*% L)
    se <- sqrt(var)
    
    # Wald test statistic (chi-square with df=1)
    wald_stat <- (est^2) / var
    pval <- 1 - pchisq(wald_stat, df = 1)
    
    # 95% CI
    ci_low <- est - 1.96 * se
    ci_high <- est + 1.96 * se
    
    # Interpretation
    interpret <- ifelse(pval < 0.05, "Reject null — significant", "Fail to reject null")
    
    # Return a data.frame row
    data.frame(
      Estimate = round(est, 4),
      Std_Error = round(se, 4),
      CI_Lower = round(ci_low, 4),
      CI_Upper = round(ci_high, 4),
      Wald_ChiSq = round(wald_stat, 3),
      P_value = round(pval, 4),
      Interpretation = interpret,
      row.names = NULL
    )
  }

# Storing all models in a named list
models <- list(
  r_su_2012 = r_su_2012,
  r_su_2018 = r_su_2018,
  r_su_2012_arc = r_su_2012_arc,
  r_su_2018_arc = r_su_2018_arc,
  r_su_2012_nonarc = r_su_2012_nonarc,
  r_su_2018_nonarc = r_su_2018_nonarc
)

# Storing corresponding hypotheses for each model
hypotheses <- list(
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018"),
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018"),
  c("rdd", "rdd:post_2012"),
  c("rdd", "rdd:post_2018")
)

# This is the variable used for cluster-robust standard errors
cluster_var <- "near_pa_FID"

# Initializing a list to store results from each mode
results_list <- list()

# Loop through all models and hypotheses
for(i in seq_along(models)) {
  mod_name <- names(models)[i]
  model <- models[[i]]
  base <- hypotheses[[i]][1]
  interaction <- hypotheses[[i]][2]
  
  res <- test_and_total_effect(model, base, interaction, cluster_var)
  res$Model <- mod_name
  res$Hypothesis <- paste0(base, " + ", interaction, " = 0")
  
  results_list[[i]] <- res[, c("Model", "Hypothesis", names(res)[1:7])]
}

# Combining all results into a single data frame
results_ht_su <- do.call(rbind, results_list)

##  Rename Model and Hypothesis labels (Table S9 - Panel B)
model_names_map <- c(
  r_su_2012 = "Post 2012",
  r_su_2018 = "Post 2018",
  r_su_2012_arc = "Arc - Post-2012",
  r_su_2018_arc = "Arc - Post-2018",
  r_su_2012_nonarc = "Non-Arc - Post-2012",
  r_su_2018_nonarc = "Non-Arc - Post-2018"
)

hypothesis_names_map <- c(
  "rdd + rdd:post_2012 = 0" = "B1 + B3 = 0",
  "rdd + rdd:post_2018 = 0" = "B1 + B3 = 0"
)

# Replace Model names in the results table
results_ht_su$Model <- model_names_map[results_ht_su$Model]
results_ht_su$Hypothesis <- hypothesis_names_map[results_ht_su$Hypothesis]

# View the renamed results - Table S9 - SU - Panel C
print(results_ht_su)


#------------------------------------------------------------------------------#
### - ### Robustness Check - Pre-Trends ----  – SUs -- Figure S16 ----
#-------------------------------------------------------------------------------
# Computing mean deforestation by year and treatment status
# 'su_hom_inside' indicates whether the area is inside or outside the SP boundary
mean_by_year <- p_su %>%
                group_by(year, rdd) %>%
                summarise(mean_estimate = mean(deforestation_pct, na.rm = TRUE),  # mean deforestation rate
                .groups = "drop")

# Creating a descriptive label for each group (inside vs. outside SP)
mean_by_year$group <- ifelse(mean_by_year$rdd == 1, "Inside SU", "Outside SU")

# Plotting mean deforestation trends over time for both groups
fig_trends_su <- ggplot(mean_by_year, aes(x = year, y = mean_estimate, color = group)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  # Vertical lines for key years
  geom_vline(xintercept = 2012, linetype = "dashed", color = "red") +
  geom_vline(xintercept = 2018, linetype = "dashed", color = "red") +
  annotate("text", x = 2012, y = max(mean_by_year$mean_estimate) * 0.8,
          label = "Forest Code", angle = 90, vjust = -0.5, size = 4.5, color = "red") +
  annotate("text", x = 2019, y = max(mean_by_year$mean_estimate) * 0.75,
          label = "Bolsonaro Elected", angle = 90, vjust = -1, size = 4.5, color = "red") +
  scale_x_continuous(
    breaks = seq(min(mean_by_year$year), max(mean_by_year$year), by = 3),
    labels = as.character(seq(min(mean_by_year$year), max(mean_by_year$year), by = 3))
  ) +
  ylab(NULL) +
  labs(
    title = "Deforestation Trends",
    x = "Year",
    y = "Mean Deforestation (%)",
    color = "Location"
  ) +
  scale_color_manual(values = c("Inside SU" = "black", "Outside SU" = "darkgray")) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 13, margin = margin(b = 10)),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 12),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_line(color = "grey90")
  )

fig_trends_su
